{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 使用CoreML训练垃圾短信过滤"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 加载模块"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0.20.2'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import sklearn\n",
    "sklearn.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "from io import open\n",
    "import matplotlib.pyplot as plt\n",
    "import csv\n",
    "import pandas\n",
    "import sklearn\n",
    "import pickle\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.svm import SVC, LinearSVC\n",
    "from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.tree import DecisionTreeClassifier \n",
    "import string\n",
    "plt.style.use('ggplot')\n",
    "# pd.options.display.mpl_style = 'default'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据\n",
    "\n",
    "数据来源: [wandouqiang/RubbishMessage](https://github.com/wandouqiang/RubbishMessage)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2019-01-04 16:30:47--  https://raw.githubusercontent.com/wandouqiang/RubbishMessage/master/data/80w.txt\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.72.133\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.72.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 64501011 (62M) [text/plain]\n",
      "Saving to: ‘data/80w.txt’\n",
      "\n",
      "data/80w.txt        100%[===================>]  61.51M   663KB/s    in 2m 36s  \n",
      "\n",
      "2019-01-04 16:33:26 (404 KB/s) - ‘data/80w.txt’ saved [64501011/64501011]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!mkdir -p data && wget -c https://raw.githubusercontent.com/wandouqiang/RubbishMessage/master/data/80w.txt -O data/80w.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "共有800000条信息\n",
      "0 1\t0\t商业秘密的秘密性那是维系其商业价值和垄断地位的前提条件之一\n",
      "1 2\t1\t南口阿玛施新春第一批限量春装到店啦         春暖花开淑女裙、冰蓝色公主衫   气质粉小西装、冰丝女王长半裙、   皇\n",
      "2 3\t0\t带给我们大常州一场壮观的视觉盛宴\n",
      "3 4\t0\t有原因不明的泌尿系统结石等\n",
      "4 5\t0\t23年从盐城拉回来的麻麻的嫁妆\n",
      "5 6\t0\t感到自减肥、跳减肥健美操、\n",
      "6 7\t1\t感谢致电杭州萧山全金釜韩国烧烤店，本店位于金城路xxx号。韩式烧烤等，价格实惠、欢迎惠顾【全金釜韩国烧烤店】\n",
      "7 8\t0\t这款UVe智能杀菌机器人是扫地机的最佳伴侣\n",
      "8 9\t1\t一次价值xxx元王牌项目；可充值xxx元店内项目卡一张；可以参与V动好生活百分百抽奖机会一次！预约电话：xxxxxxxxxxx\n",
      "9 10\t0\t此类皮肤特别容易招惹粉刺、黑头等\n"
     ]
    }
   ],
   "source": [
    "file_80w = './data/80w.txt'\n",
    "with open(file_80w) as f:\n",
    "    corpus = f.readlines()\n",
    "\n",
    "print(\"共有{}条信息\".format(len(corpus)))\n",
    "\n",
    "corpus = [x.strip() for x in corpus]\n",
    "\n",
    "for i,message in enumerate(corpus[:10]):\n",
    "    print(i, message)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "加载数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>商业秘密的秘密性那是维系其商业价值和垄断地位的前提条件之一</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>南口阿玛施新春第一批限量春装到店啦         春暖花开淑女裙、冰蓝色公主衫  ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>带给我们大常州一场壮观的视觉盛宴</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>有原因不明的泌尿系统结石等</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>23年从盐城拉回来的麻麻的嫁妆</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   label                                            message\n",
       "1      0                      商业秘密的秘密性那是维系其商业价值和垄断地位的前提条件之一\n",
       "2      1  南口阿玛施新春第一批限量春装到店啦         春暖花开淑女裙、冰蓝色公主衫  ...\n",
       "3      0                                   带给我们大常州一场壮观的视觉盛宴\n",
       "4      0                                      有原因不明的泌尿系统结石等\n",
       "5      0                                    23年从盐城拉回来的麻麻的嫁妆"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open(file_80w, encoding='UTF-8') as f:\n",
    "    messages = pd.read_csv(f, sep='\\t', quoting=csv.QUOTE_NONE, names=['label', 'message'])\n",
    "\n",
    "messages['message'] = messages['message'].map(lambda text:text)\n",
    "messages.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"4\" halign=\"left\">message</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>unique</th>\n",
       "      <th>top</th>\n",
       "      <th>freq</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>label</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>720000</td>\n",
       "      <td>707465</td>\n",
       "      <td>#NAME?</td>\n",
       "      <td>61</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>80000</td>\n",
       "      <td>79146</td>\n",
       "      <td>平安易贷可帮您获取所需资金，无须抵押，手续简便快速，快来申请吧！，在申请时输入我的邀请码xx...</td>\n",
       "      <td>92</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      message                                                                \n",
       "        count  unique                                                top freq\n",
       "label                                                                        \n",
       "0      720000  707465                                             #NAME?   61\n",
       "1       80000   79146  平安易贷可帮您获取所需资金，无须抵押，手续简便快速，快来申请吧！，在申请时输入我的邀请码xx...   92"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "messages.groupby('label').describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>message</th>\n",
       "      <th>length</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>商业秘密的秘密性那是维系其商业价值和垄断地位的前提条件之一</td>\n",
       "      <td>29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>南口阿玛施新春第一批限量春装到店啦         春暖花开淑女裙、冰蓝色公主衫  ...</td>\n",
       "      <td>67</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>带给我们大常州一场壮观的视觉盛宴</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>有原因不明的泌尿系统结石等</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>23年从盐城拉回来的麻麻的嫁妆</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   label                                            message  length\n",
       "1      0                      商业秘密的秘密性那是维系其商业价值和垄断地位的前提条件之一      29\n",
       "2      1  南口阿玛施新春第一批限量春装到店啦         春暖花开淑女裙、冰蓝色公主衫  ...      67\n",
       "3      0                                   带给我们大常州一场壮观的视觉盛宴      16\n",
       "4      0                                      有原因不明的泌尿系统结石等      13\n",
       "5      0                                    23年从盐城拉回来的麻麻的嫁妆      15"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "messages['length'] = messages['message'].map(lambda text:len(text))\n",
    "messages.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x11139b828>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaQAAAD8CAYAAAA45tAbAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAHEVJREFUeJzt3X+QnVWd5/F32y2IOpjAHTCdZCdh6EUZa8CBClmZdRiQkCCTMFv4FdchDZMlVfgLd6wawcHJLjBunLXE1BZmTQJDx7IMn0IcusZgNhVk3a0CRKKzrLDOBIyk6ZjYJERm2BUT7/7xnCY3nds/7s3t26f7+byqbvV9znPOc55vX8K3z3nOfZ6OarWKmZnZVHvDVJ+AmZkZOCGZmVkmnJDMzCwLTkhmZpYFJyQzM8uCE5KZmWXBCcnMzLLghGRmZllwQjIzsyx0TfUJZMq3rzAza05Hsw2dkEYxODjYVLtKpcLQ0FCLz2Z6cOzljB3KHb9jPxp7d3f3CR3PU3ZmZpYFJyQzM8uCE5KZmWXBCcnMzLLghGRmZllwQjIzsyw4IZmZWRackMzMLAtOSGZmlgXfqaHF9v3xe5pu27mxv4VnYmY2vbQlIUXEOcD9NUVnAX8JbE7lC4DdQEg6GBEdwDrgSuBV4HpJO9OxeoHb0nHulNSXyi8A7gNOAbYCN0uqRsRp9fqYpFDNzKxJbZmyk/RjSedLOh+4gCLJfBO4BdghqQfYkbYBlgE96bUaWA+Qkssa4CJgEbAmImanNutT3eF2S1P5aH2YmVlGpuIa0mXAc5J+CqwA+lJ5H3B1er8C2CypKulxYFZEzAGuALZLOpBGOduBpWnfqZIek1SlGHnVHqteH2ZmlpGpSEjXAl9P78+UtBcg/Twjlc8F9tS0GUhlY5UP1Ckfqw8zM8tIWxc1RMRJwHLg1nGq1nueRrWJ8kbObTXFlB+SqFQqjTR/3b6mWhWa7TMXXV1d0z6GZpU5dih3/I69dbG3e5XdMmCnpOH/b++LiDmS9qZpt/2pfACYX9NuHjCYyi8ZUf5oKp9Xp/5YfRxD0gZgQ9qsTsXzTab7M1X8XJhyxg7ljt+xT9/nIX2Io9N1AP1Ab3rfCzxUU74yIjoiYjFwKE23bQOWRMTstJhhCbAt7XslIhanFXorRxyrXh9mZpaRtiWkiHgzcDnwYE3xWuDyiPjHtG9tKt8KPA/sAjYCHwGQdAC4A3gyvW5PZQA3AZtSm+eAh8fpw8zMMtJRrTZ0qaUsqs0+wvzIjcub7nS6fzHWUxfljB3KHb9jP27Krt41/QnxrYPMzCwLTkhmZpYFJyQzM8uCE5KZmWXBCcnMzLLghGRmZllwQjIzsyw4IZmZWRackMzMLAtOSGZmlgUnJDMzy4ITkpmZZcEJyczMsuCEZGZmWXBCMjOzLDghmZlZFpyQzMwsC05IZmaWBSckMzPLghOSmZlloatdHUXELGAT8C6gCvwp8GPgfmABsBsISQcjogNYB1wJvApcL2lnOk4vcFs67J2S+lL5BcB9wCnAVuBmSdWIOK1eH5MbrZmZNaqdI6R1wLclvQM4D3gWuAXYIakH2JG2AZYBPem1GlgPkJLLGuAiYBGwJiJmpzbrU93hdktT+Wh9mJlZRtqSkCLiVOC9wD0Akl6T9DKwAuhL1fqAq9P7FcBmSVVJjwOzImIOcAWwXdKBNMrZDixN+06V9JikKrB5xLHq9WFmZhlp15TdWcDPgb+JiPOAp4CbgTMl7QWQtDcizkj15wJ7atoPpLKxygfqlDNGH8eIiNUUIywkUalUmgp0X1OtCs32mYuurq5pH0Ozyhw7lDt+x9662NuVkLqA3wM+LumJiFjH2FNnHXXKqk2UT5ikDcCG4bZDQ0ONNG+JqeizlSqVyrSPoVlljh3KHb9jPxp7d3f3CR2vXdeQBoABSU+k7QcoEtS+NN1G+rm/pv78mvbzgMFxyufVKWeMPszMLCNtSUiSfgbsiYhzUtFlwDNAP9CbynqBh9L7fmBlRHRExGLgUJp22wYsiYjZaTHDEmBb2vdKRCxOK/RWjjhWvT7MzCwjbVv2DXwc+FpEnAQ8D9xAkRAVEauAF4APpLpbKZZ876JY9n0DgKQDEXEH8GSqd7ukA+n9TRxd9v1wegGsHaUPMzPLSEe12tCllrKoDg4Ojl+rjiM3Lm+6086N/U23zYHn0ssZO5Q7fsd+3DWketf0J8R3ajAzsyw4IZmZWRackMzMLAtOSGZmlgUnJDMzy4ITkpmZZcEJyczMsuCEZGZmWXBCMjOzLDghmZlZFpyQzMwsC05IZmaWBSckMzPLghOSmZllwQnJzMyy4IRkZmZZcEIyM7MsOCGZmVkWnJDMzCwLTkhmZpaFrnZ1FBG7gVeAI8BhSRdGxGnA/cACYDcQkg5GRAewDrgSeBW4XtLOdJxe4LZ02Dsl9aXyC4D7gFOArcDNkqqj9THJ4ZqZWYPaPUL6Q0nnS7owbd8C7JDUA+xI2wDLgJ70Wg2sB0jJZQ1wEbAIWBMRs1Ob9anucLul4/RhZmYZmeopuxVAX3rfB1xdU75ZUlXS48CsiJgDXAFsl3QgjXK2A0vTvlMlPSapCmwecax6fZiZWUbaNmUHVIH/FhFV4CuSNgBnStoLIGlvRJyR6s4F9tS0HUhlY5UP1ClnjD6OERGrKUZYSKJSqTQV5L6mWhWa7TMXXV1d0z6GZpU5dih3/I69dbG3MyFdLGkwJYTtEfF/xqjbUaes2kT5hKUEuWG47dDQUCPNW2Iq+mylSqUy7WNoVpljh3LH79iPxt7d3X1Cx2vblJ2kwfRzP/BNimtA+9J0G+nn/lR9AJhf03weMDhO+bw65YzRh5mZZaQtCSki3hIRvzH8HlgC/G+gH+hN1XqBh9L7fmBlRHRExGLgUJp22wYsiYjZaTHDEmBb2vdKRCxOK/RWjjhWvT7MzCwj7RohnQn8z4j4e+B7wLckfRtYC1weEf8IXJ62oVi2/TywC9gIfARA0gHgDuDJ9Lo9lQHcBGxKbZ4DHk7lo/VhZmYZ6ahWG7rUUhbVwcHB8WvVceTG5U132rmxv+m2OfBcejljh3LH79iPu4ZU75r+hEz1sm8zMzPACcnMzDIx4YQUEZ+IiHIutjczs0nXyPeQ3gd8LiIeBb4K/K2kX07KWZmZWelMeIQkaTnwWxSr1z4J/CwiNkXEeyfr5MzMrDwaulODpJeAu4G7I+J3KUZKN0TEHorl2esk/VPrT9PMzGa6hm8dFBGXAX9CcdPS7wN/DbwA3EwxevrXrTxBMzMrhwknpIj4AnAtcIjibtq3SXqxZv/jgJ8zZGZmTWlkhPQm4I8lPVlvp6RfRcSF9faZmZmNp5GE9J8ont76unQ/uVNqbpw61h28zczMRtXIF2P/lmPvqE3a/mbrTsfMzMqqkYR0jqSnawvS9jtae0pmZlZGjSSk/RFxdm1B2n6ptadkZmZl1Mg1pHuBb0TEX1A8GuK3KR4FsWkyTszMzMqlkYS0FvgV8AWKp7buoUhGX5yE8zIzs5KZcEKS9GvgP6eXmZlZSzV0p4aIOAc4D3hrbbmke1t5UmZmVj6N3KnhM8BfAn/Psd9HqlJcXzIzM2taIyOkTwKLJP2vyToZMzMrr0aWff9fwHdiMDOzSdHICOmzwH+JiP8A7KvdkRY8jCsiOinuEP6ipKsiYiGwBTgN2AlcJ+m1iDiZ4gauF1B8z+mDknanY9wKrAKOAJ+QtC2VLwXWAZ3AJklrU3ndPhqI28zM2qCREdJ9wI3AAMXy718Bh9PPiboZeLZm+/PAXZJ6KO4UviqVrwIOSjobuCvVIyLOpbjj+O8AS4EvR0RnSnR3A8uAc4EPpbpj9WFmZhlpJCEtTK+zal7D2+OKiHnA+0lfpI2IDuBS4IFUpQ+4Or1fkbZJ+y9L9VcAWyT9UtJPgF3AovTaJen5NPrZAqwYpw8zM8tII99D+ilARLwBOFPS3gb7+hLw58BvpO3TgZclHU7bA8Dc9H4uxRdvkXQ4Ig6l+nOBx2uOWdtmz4jyi8bp4xgRsRpYnfqkUqk0GF5h3/hVRtVsn7no6uqa9jE0q8yxQ7njd+yti72RZd+zgC8D11BM070lIpZTrLy7bZy2VwH7JT0VEZek4o46Vavj7ButvN5Ib6z6x5G0AdgwXGdoaKhetUk1FX22UqVSmfYxNKvMsUO543fsR2Pv7u4+oeM1MmX3XymeFvtbwPCigMeAD06g7cXA8ojYTTGddinFiGlWRAwnxXnAYHo/QHF7ItL+twEHastHtBmtfGiMPszMLCONJKTLKFa17SWNMiT9HDhjvIaSbpU0T9ICikUJj0j6MPAdihEXQC/wUHrfn7ZJ+x+RVE3l10bEyWn1XA/wPeBJoCciFkbESamP/tRmtD7MzCwjjSSkQ8Axk4UR8S+ARq8l1fo08GcRsYvies89qfwe4PRU/mfALQCSfgQIeAb4NvBRSUfSNaKPAdsoVvEp1R2rDzMzy0hHtVr3kspxIuIWYDnwFxRPiV0GfA54SNKXJu0Mp0Z1cLC5mb0jNy5vutPOjf1Nt82B59LLGTuUO37Hftw1pHrX7iekkS/Gfh74fxTf93kjxf3rvkLxZVQzM7MT0siy7yrFQoSZNhoyM7MMNLLs+9LR9kl6pDWnY2ZmZdXIlN3IxQC/CZxEseR6QndrMDMzG00jU3YLa7fT/eNuA15p9UmZmVn5NLLs+xiSjgB/RXE7IDMzsxPSdEJKLgcm9OgJMzOzsTSyqGEPx94H7s3Am4CPtPqkzMysfBpZ1PAnI7b/GfgHSb9o4fmYmVlJNbKo4b9P5omYmVm5NTJl91VGeXRDLUkrT+iMzMyslBpZ1PAyxdNWOym+e/QGiie4vgw8V/MyMzNrWCPXkP4l8H5J/2O4ICJ+H/ispCtafmZmZlYqjYyQFnPs48MBngD+VetOx8zMyqqRhPQD4HMRcQpA+vlXwA8n48TMzKxcGklI11M8ivxQROyjeGDf73P0ya5mZmZNa2TZ927gPRExH+gG9kp6YbJOzMzMyqWhWwdFxOnAJcAfSHohIrojYt6knJmZmZXKhBNSRPwB8GPgw8BnU3EPsH4SzsvMzEqmkWXfXwI+KGlHRBxMZU8Ai8ZrGBFvAr4LnJz6fEDSmohYCGwBTgN2AtdJei0iTgY2AxcAL6V+d6dj3QqsAo4An5C0LZUvpXiceiewSdLaVF63jwbiNjOzNmhkym6BpB3p/fAdG15jYkntl8Clks4DzgeWRsRi4PPAXZJ6gIMUiYb086Cks4G7Uj0i4lzgWuB3gKXAlyOiMz2b6W5gGXAu8KFUlzH6MDOzjDSSkJ6JiJFfgH0f8PR4DSVVJf1T2nxjelWBS4EHUnkfxZ0goLgDRF96/wBwWUR0pPItkn4p6SfALooR2iJgl6Tn0+hnC7AitRmtDzMzy0gjU3afAv4uIr4FnBIRXwH+iCJJjCuNYp4CzqYYzTwHvCzpcKoyAMxN7+cCewAkHY6IQ8Dpqbz2y7m1bfaMKL8otRmtDzMzy0gjy74fj4jfpXgMxb0UCWCRpIEJtj8CnB8Rs4BvAu+sU214KrBjlH2jldcb6Y1V/zgRsRpYnc6VSqVSr9q49jXVqtBsn7no6uqa9jE0q8yxQ7njd+yti31CCSmNbnYAV0j66xPpUNLLEfEoxa2IZkVEVxrBzAMGU7UBYD4wEBFdwNuAAzXlw2rb1CsfGqOPkee1AdiQNqtDQ0MnEmZTpqLPVqpUKtM+hmaVOXYod/yO/Wjs3d3dJ3S8CV1DSqObhROtP1JE/GYaGQ3fcuh9wLPAd4BrUrVe4KH0vp+jd4C4BnhEUjWVXxsRJ6fVcz3A94AngZ6IWBgRJ1EsfOhPbUbrw8zMMtLINaT/CKyPiDUUI5XXp74k/XqctnOAvjTSekPRRH8XEc8AWyLiTop75d2T6t8DfDUidlGMjK5N/fwoIgQ8AxwGPpqSJRHxMWAbxbLveyX9KB3r06P0YWZmGemoVsd95h4AEVGbdGqv9VQldbb6xKZYdXCw7szeuI7cuLzpTjs39jfdNgeeuihn7FDu+B37cVN29a7dT8i4U3AR8fb0dmHN66z0Gn5vZmZ2QiYyZfcPwKmSfgoQEQ9K+jeTe1pmZlY2E1mkMHL4dckknIeZmZXcRBLSxC4ymZmZnYCJTNl1RcQfcnSkNHIbSY9MxsmZmVl5TCQh7ae4M8Owl0ZsV/HCBjMzO0HjJiRJC9pwHmZmVnJN3XnBzMys1ZyQzMwsC05IZmaWBSckMzPLghOSmZllwQnJzMyy4IRkZmZZcEIyM7MsOCGZmVkWnJDMzCwLTkhmZpYFJyQzM8uCE5KZmWVhIo+fOGERMR/YDLwd+DWwQdK6iDgNuB9YAOwGQtLBiOgA1gFXAq8C10vamY7VC9yWDn2npL5UfgFwH3AKsBW4WVJ1tD4mOWQzM2tQu0ZIh4FPSXonsBj4aEScC9wC7JDUA+xI2wDLgJ70Wg2sB0jJZQ1wEbAIWBMRs1Ob9anucLulqXy0PszMLCNtSUiS9g6PcCS9AjwLzAVWAH2pWh9wdXq/AtgsqSrpcWBWRMwBrgC2SzqQRjnbgaVp36mSHpNUpRiN1R6rXh9mZpaRtl9DiogFwLuBJ4AzJe2FImkBZ6Rqc4E9Nc0GUtlY5QN1yhmjDzMzy0hbriENi4i3At8APinpFxExWtWOOmXVJsobObfVFFN+SKJSqTTS/HX7mmpVaLbPXHR1dU37GJpV5tih3PE79tbF3raEFBFvpEhGX5P0YCreFxFzJO1N0277U/kAML+m+TxgMJVfMqL80VQ+r079sfo4hqQNwIa0WR0aGmo8yBM0FX22UqVSmfYxNKvMsUO543fsR2Pv7u4+oeO1ZcourZq7B3hW0hdrdvUDvel9L/BQTfnKiOiIiMXAoTTdtg1YEhGz02KGJcC2tO+ViFic+lo54lj1+jAzs4y0a4R0MXAd8HRE/DCVfQZYCygiVgEvAB9I+7ZSLPneRbHs+wYASQci4g7gyVTvdkkH0vubOLrs++H0Yow+zMwsIx3VakOXWsqiOjg4OH6tOo7cuLzpTjs39jfdNgeeuihn7FDu+B37cVN29a7pT4jv1GBmZllwQjIzsyw4IZmZWRackMzMLAtOSGZmlgUnJDMzy4ITkpmZZcEJyczMsuCEZGZmWXBCMjOzLDghmZlZFpyQzMwsC05IZmaWBSckMzPLghOSmZllwQnJzMyy4IRkZmZZcEIyM7MsOCGZmVkWnJDMzCwLXe3oJCLuBa4C9kt6Vyo7DbgfWADsBkLSwYjoANYBVwKvAtdL2pna9AK3pcPeKakvlV8A3AecAmwFbpZUHa2PSQ7XzMya0K4R0n3A0hFltwA7JPUAO9I2wDKgJ71WA+vh9QS2BrgIWASsiYjZqc36VHe43dJx+jAzs8y0JSFJ+i5wYETxCqAvve8Drq4p3yypKulxYFZEzAGuALZLOpBGOduBpWnfqZIek1QFNo84Vr0+zMwsM1N5DelMSXsB0s8zUvlcYE9NvYFUNlb5QJ3ysfowM7PMtOUaUoM66pRVmyhvSESsppj2QxKVSqXRQwCwr6lWhWb7zEVXV9e0j6FZZY4dyh2/Y29d7FOZkPZFxBxJe9O02/5UPgDMr6k3DxhM5ZeMKH80lc+rU3+sPo4jaQOwIW1Wh4aGmgrqRExFn61UqVSmfQzNKnPsUO74HfvR2Lu7u0/oeFM5ZdcP9Kb3vcBDNeUrI6IjIhYDh9J02zZgSUTMTosZlgDb0r5XImJxWqG3csSx6vVhZmaZadey769TjG4qETFAsVpuLaCIWAW8AHwgVd9KseR7F8Wy7xsAJB2IiDuAJ1O92yUNL5S4iaPLvh9OL8bow8zMMtNRrTZ8uaUMqoODg+PXquPIjcub7rRzY3/TbXPgqYtyxg7ljt+xHzdlV++6/oT4Tg1mZpaFHFfZlVaZR1dmZh4hmZlZFpyQzMwsC05IZmaWBSckMzPLghOSmZllwQnJzMyy4IRkZmZZcEIyM7MsOCGZmVkWnJDMzCwLTkhmZpYFJyQzM8uCE5KZmWXBCcnMzLLghGRmZllwQjIzsyw4IZmZWRackMzMLAuleIR5RCwF1gGdwCZJa6f4lMzMbIQZP0KKiE7gbmAZcC7woYg4d2rPyszMRprxCQlYBOyS9Lyk14AtwIopPiczMxuhDFN2c4E9NdsDwEVTdC6T5siNy6es786N/VPWt5nNHGVISB11yqojCyJiNbAaQBLd3d3N9fat7zfXboZo+vc2A5Q5dih3/I69NcowZTcAzK/ZngcMjqwkaYOkCyVdSJHEmnpFxFMn0n46vxz71J+H43fsGcTetDKMkJ4EeiJiIfAicC3wb6f2lMzMbKQZP0KSdBj4GLANeLYo0o+m9qzMzGykMoyQkLQV2Nqm7ja0qZ8cOfbyKnP8jr1FOqrV467vm5mZtd2Mn7IzM7PpoRRTdu0w029PFBHzgc3A24FfAxskrYuI04D7gQXAbiAkHYyIDorfx5XAq8D1knZOxbm3Srrrx/eBFyVdlRbKbAFOA3YC10l6LSJOpvhdXQC8BHxQ0u4pOu2WiIhZwCbgXRRfm/hT4MeU4LOPiH8P/DuKuJ8GbgDmMEM/+4i4F7gK2C/pXams4X/nEdEL3JYOe6ekvvH69gipBUpye6LDwKckvRNYDHw0xXgLsENSD7AjbUPxu+hJr9XA+vafcsvdTLEwZtjngbtS7AeBVal8FXBQ0tnAXanedLcO+LakdwDnUfweZvxnHxFzgU8AF6b/OXdSrNSdyZ/9fcDSEWUNfdYpga2huAnBImBNRMwer2MnpNaY8bcnkrR3+C8fSa9Q/A9pLkWcw3/59AFXp/crgM2SqpIeB2ZFxJw2n3bLRMQ84P0UowTSX4aXAg+kKiNjH/6dPABclupPSxFxKvBe4B4ASa9JepmSfPYUM0mnREQX8GZgLzP4s5f0XeDAiOJGP+srgO2SDkg6CGzn+CR3HCek1qh3e6K5U3Quky4iFgDvBp4AzpS0F4qkBZyRqs2038mXgD+nmK4EOB14OX2tAI6N7/XY0/5Dqf50dRbwc+BvIuIHEbEpIt5CCT57SS8CXwBeoEhEh4CnKM9nP6zRz7qp/wackFqj3l9AM3L5YkS8FfgG8ElJvxij6oz5nUTE8Hz6UzXFY8U3Y2JPuoDfA9ZLejfwzxydsqlnxsSfpplWAAuBbuAtFNNUI83Uz348o8Xb1O/BCak1JnR7oukuIt5IkYy+JunBVLxveDom/dyfymfS7+RiYHlE7KaYjr2UYsQ0K03jwLHxvR572v82jp8CmU4GgAFJT6TtBygSVBk++/cBP5H0c0m/Ah4E3kN5PvthjX7WTf034ITUGq/fnigiTqK46DmjboGd5sHvAZ6V9MWaXf1Ab3rfCzxUU74yIjoiYjFwaHjIP91IulXSPEkLKD7bRyR9GPgOcE2qNjL24d/JNan+tP0rWdLPgD0RcU4qugx4hhJ89hRTdYsj4s3p38Bw7KX47Gs0+llvA5ZExOw0ylySysbkZd8tIOlwRAzfnqgTuHcG3p7oYuA64OmI+GEq+wywFlBErKL4x/uBtG8rxVLQXRTLQW9o7+m2xaeBLRFxJ/AD0kX/9POrEbGL4q/ja6fo/Frp48DX0h9cz1N8nm9ghn/2kp6IiAcolnYfpvicNwDfYoZ+9hHxdeASoBIRAxSr5Rr6dy7pQETcQfHHOsDtksYdKfpODWZmlgVP2ZmZWRackMzMLAtOSGZmlgUnJDMzy4ITkpmZZcEJyczMsuCEZGZmWXBCMjOzLPx/oil0iE9yOyQAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "messages.length.plot(bins=20, kind='hist')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    800000.000000\n",
       "mean         25.663336\n",
       "std          19.480775\n",
       "min           1.000000\n",
       "25%          15.000000\n",
       "50%          19.000000\n",
       "75%          27.000000\n",
       "max         962.000000\n",
       "Name: length, dtype: float64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "messages.length.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([<matplotlib.axes._subplots.AxesSubplot object at 0x112805400>,\n",
       "       <matplotlib.axes._subplots.AxesSubplot object at 0x11307ecf8>],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZgAAAEXCAYAAACQ3VJYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3X+UnFWd5/F3mQbHFfnZEvIDN7hGR2DXuHCAMxw9UQQDgwTn4FfCHhKQJaJhxMFz5Md4BhXwhB2Rza7KmEAmyRwlfAZFshoIWQRZV8KvyCgQHUOI0iQkhATEYQcmofaP5xZdqa7uru6qp+qp7s/rnDpddes+T92nu6q+fZ/7fe4tlctlzMzMWu1NnW6AmZmNTQ4wZmaWCwcYMzPLhQOMmZnlwgHGzMxy4QBjZma5cIAxM7NcOMCMQ6VS6bRSqfRYqVR6tVQqbS6VSpd2uk1mnVAqlT5YKpXuKJVKvyuVSuVSqfSlTrdpLHGAGWdKpdKxwB3AXcAM4MvA10ql0kWdbJdZh+wHPAl8EXiuw20Zc0q+kn98KZVK3wOmlcvlP6sq+1vgrHK5fETnWmbWWaVSaTNwU7lcvqbTbRkr3IMZf04k671UuwuYViqVpnagPWY2RjnAjD+TGHgq4Lmq58zMWsIBxqr5fKmZtYwDzPizFTispmxi+ulBTjNrGQeY8ef/Ah+tKZsF/K5cLvd1oD1mNkb1dLoB1nY3AD8vlUrXAv8AHAf8JfBXHW2VWQeUSqX9gHelh/sCh5VKpRnAH8vl8sbOtWxscJryOFQqlf4c+Brwp2SnxRaVy+VvdLZVZu1XKpVmAvfWeeqn5XJ5ZntbM/Y4wJiZWS48BmNmZrlwgDEzs1w4wJiZWS6GzSKLiMOBFWTXTrwOLJa0KCIOBm4FpgGbgZC0KyJKwCLgNOAV4DxJ69O+5gGV2UqvkbQ8lR8DLAPeAqwGLpFUHuw1mj5qMzPLXSM9mN3AFyS9FzgBWBARRwKXA/dImg7ckx4DnApMT7f5wI0AKVhcBRxPlhp7VUQclLa5MdWtbDcrlQ/2GmZmVnDD9mAkbSW7+htJL0fEBmAKMBuYmaotB+4DLkvlKySVgXURcWBETEp110raCRARa4FZEXEfsL+kB1L5CuBM4M4hXmMoTouzvJQ63YAR8mfB8jTs52FEF1pGxDTg/cCDwMQUfJC0NSIOTdWmAM9UbdaXyoYq76tTzhCvUduu+WQ9ICTx2muvDajT09PD7t27Gz7WbuXjzMe+++7bttdqpS1bttQt7+3tZceOHW1uzeCK1J4itQWK1Z5KWyZPntxQ/YYDTETsB3wf+LykP0TEYFXrRbXyKMobJmkxsLiybb0/RpH+SHnyceaj0Q+UmfVrKIssIvYhCy7flfSDVLwtnfoi/dyeyvuAw6s2nwpsGaZ8ap3yoV7DzMwKbtgAk7LCbgY2SKqeTmQVMC/dn0e2DG+lfG5ElCLiBOCldJprDXBKRByUBvdPAdak516OiBPSa82t2Ve91zAzs4Jr5BTZicC5wK8i4rFUdiWwEFBEXAD8HvhEem41WYryRrI05fMBJO2MiKuBh1O9r1YG/IHP0J+mfGe6McRrmJlZwY3FucjK9QY2PTYxtnRoDKbrssg8yD9yRWoLFKs9NYP8w34efCW/mZnlwgHGzMxy4QBjZma58IqWDdhz4RkATFiyqsMtMWuPynse/L630XMPxszMcuEAY2ZmuXCAMTOzXDjAmJlZLhxgzMwsFw4wZmaWC6cpm7VIRGwGXgb2ALslHduOpcXbcnBmo+AejFlrfUjSDEnHpsftWFrcrJAcYMzyNZtsuW/SzzOryldIKktaB1SWFv8oaWlxSbuAytLik0hLi6dey4qqfZkVkgOMWeuUgbsj4tG0jDfULPsN5LG0uFkheQzGrHVOlLQlIg4F1kbEr4eom8vS4imwzQeQRG9vb90X7+npGfQ5gG1V94eq1yrDtaeditQWKFZ7RtoWBxizFpG0Jf3cHhG3k42hbIuISZK2jmBp8Zk15fcx9NLi1W1YDCxOD8uDrSMykjVG2rEWSRHXPCmKIrWnZj2YYfkUmVkLRMRbI+JtlftkS4I/TnuWFjcrJAcYs9aYCPwsIv4JeAj4saS7yJb9PjkifgucnB5Dlma8iWxp8SXAZyFbWhyoLC3+MAOXFr8pbfMU/UuLmxXSsKfIImIpcDqwXdLRqexW4D2pyoHAi5JmRMQ0YAPwm/TcOkkXpW3q5vCP5joBs6KRtAl4X53yF4CT6pSXgQWD7GspsLRO+SPA0U031qxNGhmDWQZ8kywtEgBJn6zcj4jrgZeq6j8laUad/VRy+NeRBZhZZP+BVa4TWBgRl6fHl7H3dQLHp+2Pb/TAzMyss4Y9RSbpfmBnvedSLyOAW4baxzA5/CO9TsDMzLpAs1lkHwC2SfptVdkREfEL4A/AlyT9H4bO4d/rOoGU4gmDXw+wtbYRjaRmNpPqV0nZLEqq4FCKlNKYp/FynGbdrNkAM4e9ey9bgXdIeiGNufwwIo6iwRz+Gg1v00hqZitS/YqSKjiUIqU05qndx9loWqaZ9Rt1FllE9AB/QTZAD4CkV9OgJpIeJct0eTdD5/Bvq5z6avA6ATMz6wLNpCl/BPi1pDdOfUXE2yNiQrr/TrIB+k3D5PCP9DoBMzPrAsMGmIi4BXgAeE9E9EXEBempsxk4uP9B4JfpWoDbgIsayOEf0XUCZmbWHUrl8phbTqK8ZcvAM2nNnLPfc+EZAExYsqqphrWDx2DykcZg6o0LFlndzwIM//urvOehPe/7Ir1vi9QWKFZ7aqaKGfbz4Cv5zcwsFw4wZmaWCwcYMzPLhQOMmZnlwgHGzMxy4QBjZma5cIAxM7NcOMCYmVkuHGDMzCwXDjBmZpYLBxgzM8uFA4yZmeXCAcbMzHLhAGNmZrlwgDEzs1w4wJiZWS4cYMzMLBc9w1WIiKXA6cB2SUensi8DFwLPp2pXSlqdnrsCuADYA3xO0ppUPgtYBEwAbpK0MJUfAawEDgbWA+dKei0i3gysAI4BXgA+KWlzC47ZzMzaoJEezDJgVp3yGyTNSLdKcDkSOBs4Km3z7YiYEBETgG8BpwJHAnNSXYDr0r6mA7vIghPp5y5J7wJuSPXMzKxLDBtgJN0P7Gxwf7OBlZJelfQ0sBE4Lt02Stok6TWyHsvsiCgBHwZuS9svB86s2tfydP824KRU38zMukAzYzAXR8QvI2JpRByUyqYAz1TV6Utlg5UfArwoaXdN+V77Ss+/lOqbmVkXGHYMZhA3AlcD5fTzeuBTQL0eRpn6gaw8RH2GeW4vETEfmA8gid7e3gF1enp66pY3Ylv6Odrt26mZ4+wm4+U4zbrZqAKMpMp3LhGxBPhRetgHHF5VdSqwJd2vV74DODAielIvpbp+ZV99EdEDHMAgp+okLQYWp4flHTt2DKjT29tLvfKRaHb7dmjFcXaDdh/n5MmTG6qXxhsfAZ6VdPpoklhGmihjVlSjOkUWEZOqHn4ceDzdXwWcHRFvTh+s6cBDwMPA9Ig4IiL2JUsEWCWpDNwLnJW2nwfcUbWveen+WcBPUn2zIrsE2FD1eERJLKNMlDErpEbSlG8BZgK9EdEHXAXMjIgZZKesNgOfBpD0REQIeBLYDSyQtCft52JgDdl/X0slPZFe4jJgZURcA/wCuDmV3wz8Q0RsJOu5nN300ZrlKCKmAn8OXAtcWpXEck6qshz4Mtkp5tnpPmRJLN9M9d9IlAGeTu//41K9jZI2pddameo+mfNhmY3asAFG0pw6xTfXKavUv5bsA1ZbvhpYXad8E/0foOryfwU+MVz7zArkvwNfBN6WHjecxBIRlSSWKcC6qn1Wb1ObKHN8qw/ArJVGO8hvZlUionIx8qMRMTMVjyaJZaSJMrXtGDbhBYZPkthWdb8dyRRFStooUlugWO0ZaVscYMxa40TgjIg4DfgTYH+yHs1Ik1hGmiizl0YSXmBkSRLtSKYoUnJKkdoCxWpPpS2NJr14LjKzFpB0haSpkqaRjRf+RNJ/YeRJLCNKlGnDoZmNmgOMWb4uIxvw30g2xlKdxHJIKr8UuByyRBmgkihzFylRJvWAKokyG7KqbyTKmBWST5GZtZik+4D70v0RJ7GMNFHGrKjcgzEzs1w4wJiZWS4cYMzMLBcOMGZmlgsHGDMzy4UDjJmZ5cIBxszMcuEAY2ZmuXCAMTOzXDjAmJlZLhxgzMwsFw4wZmaWCwcYMzPLxbCzKUfEUqCyWt/RqexvgY8BrwFPAedLejEippFNJf6btPk6SRelbY4BlgFvIZsR9hJJ5Yg4GLgVmAZsBkLSrrQ++SLgNOAV4DxJ61twzGZm1gaN9GCWAbNqytYCR0v6T8A/A1dUPfeUpBnpdlFV+Y1kS7lOT7fKPi8H7pE0HbgnPQY4taru/LS9mZl1iWEDjKT7yZZyrS67Oy2ABLCObPnWQUXEJGB/SQ+kVftWAGemp2cDy9P95TXlKySVJa0jW3p2UgPHZGZmBdCKMZhPAXdWPT4iIn4RET+NiA+ksilka41X9KUygImStgKkn4dWbfPMINuYmVnBNbWiZUT8NbAb+G4q2gq8Q9ILaczlhxFxFFCqs3l5mN03vE1EzCc7jYYkent7B9Tp6empW96IbennaLdvp2aOs5uMl+M062ajDjARMY9s8P+kdNoLSa8Cr6b7j0bEU8C7yXof1afRpgJb0v1tETFJ0tZ0Cmx7Ku8DDh9km71IWgwsTg/LO3bsGFCnt7eXeuUj0ez27dCK4+wG7T7OyZMnt+21zMaKUZ0ii4hZwGXAGZJeqSp/e0RMSPffSTZAvymd+no5Ik5I2WFzgTvSZquAeen+vJryuRFRiogTgJcqp9LMzKz4GklTvgWYCfRGRB9wFVnW2JuBtREB/enIHwS+GhG7gT3ARZIqCQKfoT9N+U76x20WAoqIC4DfA59I5avJUpQ3kqUpn9/MgZqZWXsNG2AkzalTfPMgdb8PfH+Q5x4Bjq5T/gJwUp3yMrBguPaZmVkx+Up+MzPLhQOMmZnlwgHGzMxy0dR1MOPNngvPeOP+hCWrOtgSM7Picw/GzMxy4QBjZma5cIAxM7NcOMCYmVkuPMhv1gIR8SfA/WQzXPQAt0m6KiKOAFYCBwPrgXMlvRYRbyZbtuIY4AXgk5I2p31dAVxANhvG5yStSeWzyBbhmwDcJGlhGw/RbMTcgzFrjVeBD0t6HzADmJXm0LsOuCEtqLeLLHCQfu6S9C7ghlSPiDgSOBs4imxRvm9HxIQ0x9+3yBbiOxKYk+qaFZZ7MGYtkKY2+mN6uE+6lYEPA+ek8uXAl8lWZ52d7gPcBnwzTQQ7G1iZZiZ/OiI2AselehslbQKIiJWp7pP5HZVZc9yDMWuR1NN4jGzJibXAU8CLVau/Vi+a98aCeun5l4BDGHyhPS/AZ13HPRizFpG0B5gREQcCtwPvrVOtsmjeYAvqDVZe75/BAQvwNbL4Hgy/YNu2qvvtWNitSAvIFaktUKz2jLQtDjBmLSbpxYi4DzgBODAielIvpXrRvMqCen0R0QMcAOxk6IX2hl2Ar5HF92BkC7a1Y2G3Ii2UV6S2QLHaU2lLowvw+RSZWQukxfYOTPffAnwE2ADcC5yVqtUuqFdZaO8s4CdpHGcVcHZEvDlloE0HHgIeBqZHxBERsS9ZIoDnK7JCc4Axa41JwL0R8UuyYLBW0o/IVn69NA3WH0L/Wko3A4ek8kuBywEkPQGIbPD+LmCBpD2pB3QxsIYscCnVNSssnyIzawFJvwTeX6d8E/1ZYNXl/0r/6q21z10LXFunfDXZSq9mXcE9GDMzy0VDPZiIWAqcDmyXdHQqOxi4FZgGbAZC0q6Uy78IOA14BThP0vq0zTzgS2m310hansqPAZYBbyH7D+0SSeXBXqOpIzYzs7ZotAezjOyq4mqXA/ekK5TvSY8hu9J4errNJ7uorBKQrgKOJztlcFVEHJS2uTHVrWw3a5jXMDOzgmsowEi6nyyFstpssiuTST/PrCpfIaksaR1ZmuYk4KNkA587Uy9kLdl0GpOA/SU9kLJoVtTsq95rmJlZwTUzyD9R0lYASVsj4tBUPtIrkaek+7XlQ73GXhq5uKyZi5W21SkryoVPtYp0UVaexstxmnWzPLLIRnqF8mDlDWvk4rJWX6xUlAufahXpoqw8tfs4G72wzMz6NZNFti2d3iL93J7KB7sSeajyqXXKh3oNMzMruGYCTPWVyLVXKM+NiFKarvyldJprDXBKRByUBvdPAdak516OiBNSBtpc6l/tXP0aZmZWcI2mKd8CzAR6I6KPLBtsIaCIuAD4Pf0Xja0mS1HeSJamfD6ApJ0RcTXZVc4AX5VUSRz4DP1pynemG0O8RqHsufAMACYs8cwdZmYVDQUYSXMGeeqkOnXLwIJB9rMUWFqn/BHg6DrlL9R7DTMzKz5fyW9mZrlwgDEzs1w4wJiZWS4cYMzMLBcOMGZmlgsHGDMzy4UXHBulyrUvZmZWn3swZmaWCwcYMzPLhQOMmZnlwgHGzMxy4QBjZma5cIAxM7NcOMCYmVkuHGDMzCwXDjBmZpYLBxgzM8uFA4yZmeVi1HORRcR7gFurit4J/A1wIHAh8Hwqv1LS6rTNFcAFwB7gc5LWpPJZwCJgAnCTpIWp/AhgJXAwsB44V9Jro22zmZm1z6gDjKTfADMAImIC8CxwO3A+cIOkr1fXj4gjgbOBo4DJwP+OiHenp78FnAz0AQ9HxCpJTwLXpX2tjIi/IwtON462zWZm1j6tmk35JOApSb+LiMHqzAZWSnoVeDoiNgLHpec2StoEEBErgdkRsQH4MHBOqrMc+DIOMFZAEXE4sAI4DHgdWCxpUUQcTNbTnwZsBkLSrogokfXaTwNeAc6TtD7tax7wpbTrayQtT+XHAMuAtwCrgUskldtygGaj0KoAczZwS9XjiyNiLvAI8AVJu4ApwLqqOn2pDOCZmvLjgUOAFyXtrlN/LxExH5gPIIne3t4BdXp6euqWN2Jbg/VGu/9WauY4u0kBj3M32Xt9fUS8DXg0ItYC5wH3SFoYEZcDlwOXAacC09PteLJ/nI5PAekq4FignPazKn2GbiR7n68jCzCzgDvbeIxmI9J0gImIfYEzgCtS0Y3A1WQfjquB64FPAaU6m5epn2hQHqL+AJIWA4srdXbs2DGgTm9vL/XKWynv/TeiHcdZBO0+zsmTJw/5vKStwNZ0/+XUA59C1nOfmaotB+4jCzCzgRWpB7IuIg6MiEmp7lpJOwFSkJoVEfcB+0t6IJWvAM7EAcYKrBVZZKcC6yVtA5C0TdIeSa8DS+g/DdYHHF613VRgyxDlO4ADI6Knptys0CJiGvB+4EFgYgo+lSB0aKo2hYE99ynDlPfVKTcrrFacIptD1emxiJhU+UABHwceT/dXAd+LiG+QDfJPBx4i66lMTxljz5KdbjtHUjki7gXOIsskmwfc0YL2muUmIvYDvg98XtIfhhiTHKyHPtLy2tcf9nQxDH+Ksfq0cDtORRbplGeR2gLFas9I29JUgImIf0eW/fXpquL/FhEzyN78myvPSXoiIgQ8SXa+eoGkPWk/FwNryNKUl0p6Iu3rMmBlRFwD/AK4uZn2muUpIvYhCy7flfSDVLyt8k9XOgW2PZUP1aOfWVN+XyqfWqf+Xho5XQwjO8XYjlORRTq1W6S2QLHaU2nLcKeMK5oKMJJeIRuMry47d4j61wLX1ilfTTZoWVu+if5TbGaFlbLCbgY2SPpG1VOryHrfC9m7F76KLBlmJdkg/0spCK0BvhYRB6V6pwBXSNoZES9HxAlkp97mAv8z9wMza0KrssjMxrsTgXOBX0XEY6nsSrLAooi4APg98In03GqyFOWNZGnK5wOkQHI18HCq99XKgD/wGfrTlO/EA/xWcA4wZi0g6WfUHyeB7Dqx2vplYMEg+1oKLK1T/ghwdBPNNGsrz0VmZma5cIAxM7NcOMCYmVkuHGDMzCwXDjBmZpYLBxgzM8uFA4yZmeXCAcbMzHLhAGNmZrlwgDEzs1w4wJiZWS4cYMzMLBcOMGZmlgsHGDMzy4UDjJmZ5cIBxszMctH0gmMRsRl4GdgD7JZ0bEQcDNwKTAM2AyFpV1pWdhHZSn6vAOdJWp/2Mw/4UtrtNZKWp/Jj6F/FbzVwSVqsyczMCqxVPZgPSZoh6dj0+HLgHknTgXvSY4BTgenpNh+4ESAFpKvI1iY/Driqak3yG1PdynazWtRmMzPLUV6nyGYDy9P95cCZVeUrJJUlrQMOjIhJwEeBtZJ2StoFrAVmpef2l/RA6rWsqNqXmZkVWNOnyIAycHdElIHvSFoMTJS0FUDS1og4NNWdAjxTtW1fKhuqvK9O+V4iYj5ZLwdJ9Pb2DmhkT09P3fJGbGuw3mj330rNHGc3GS/HadbNWhFgTpS0JQWRtRHx6yHqluqUlUdRvpcU1BZXnt+xY8eAjXp7e6lX3kp5778R7TjOImj3cU6ePLltr2U2VjR9ikzSlvRzO3A72RjKtnR6i/Rze6reBxxetflUYMsw5VPrlJuZWcE1FWAi4q0R8bbKfeAU4HFgFTAvVZsH3JHurwLmRkQpIk4AXkqn0tYAp0TEQWlw/xRgTXru5Yg4IWWgza3al5mZFVizPZiJwM8i4p+Ah4AfS7oLWAicHBG/BU5OjyFLM94EbASWAJ8FkLQTuBp4ON2+msoAPgPclLZ5CrizyTabmVkblMrlMXdJSXnLloFn0Zo5Z7/nwjNGvM2EJatG9VrN8hhMPtIYTL0xwSKr+1mA4X9/1e/5dryXi/S+LVJboFjtqbSl0c9DKwb5zWyMGM0/U2aD8VQxZmaWCwcYMzPLhQOMmZnlwgHGzMxy4UF+sxaIiKXA6cB2SUenMs8qbuOaezBmrbGMgTN9e1ZxG9ccYMxaQNL9wM6aYs8qbuOaA0wb7LnwjDduNq7sNas4kOus4mZF4zEYs/bLZVZxaGzpChh8uYN6S1O0Y1mEIi2/UKS2QLHaM9K2OMCY5WdbRExKayI1Oqv4zJry+xjBrOKNLF0BI5t+pB3TlBRxOpSiKFJ7aqaKGZZPkZnlx7OK27jmHkxOPN4yvkTELWS9j96I6CPLBlsIKCIuAH4PfCJVX02WoryRLE35fMhmFY+IyqziMHBW8WVkacp34lnFrQs4wJi1gKQ5gzx1Up26ZWDBIPtZCiytU/4IcHQzbTRrN58iMzOzXDjAmJlZLhxgzMwsF6Meg4mIw8muKD4MeB1YLGlRRHwZuBB4PlW9UtLqtM0VwAXAHuBzktak8llkczNNAG6StDCVHwGsBA4G1gPnSnpttG02M7P2aaYHsxv4gqT3AicACyLiyPTcDZJmpFsluBwJnA0cRTaP0rcjYkJETAC+RTY/05HAnKr9XJf2NR3YRRaczMysC4w6wEjaWpkBVtLLwAaGnr5iNrBS0quSniZL0Twu3TZK2pR6JyuB2Snf/8PAbWn76rmczMys4FqSphwR04D3Aw8CJwIXR8Rc4BGyXs4usuCzrmqz6vmUaudfOh44BHhR0u469c3MrOCaDjARsR/wfeDzkv4QETcCV5PNlXQ1cD3wKQafT6leL6rl8y81M59PvfmZRivvOYWKNG9RnsbLcZp1s6YCTETsQxZcvivpBwCStlU9vwT4UXo42PxLDFK+g2wa857Ui2lq/qWizOeTdxuKcpx5a/dxNjr3kpn1ayaLrATcDGyQ9I2q8kmVKcqBjwOPp/urgO9FxDeAyWSLJj1E1lOZnjLGniVLBDhHUjki7gXOIhuXqZ7LycxaxNMaWV6a6cGcCJwL/CoiHktlV5Jlgc0gO521Gfg0gKQnIkLAk2QZaAsk7QGIiIvJJvqbACyV9ETa32XAyoi4BvgFWUAzM7MuMOoAI+ln1B8nWT3ENtcC19YpX11vO0mbyLLMzMysy/hKfjMzy4UDjJmZ5cIBxszMcuEA02Z7LjzDWTtmNi44wJiZWS4cYMzMLBdeMrkAqk+ZTViyqoMtMTNrHQeYQeQ9TuJxGDMb63yKzMzMcuEAY2ZmuXCAMTOzXDjAmJlZLhxgzMwsF84iKxinLJvZWOEA0wUcdKzVtn38zzrdBBsHHGAKzNfKWBH4HxwbLY/BmJlZLtyDMbOG1etVu1djgyl8gImIWcAiYAJwk6SFHW5SRw172uz2n7enIdYR/jxYNyn0KbKImAB8CzgVOBKYExFHdrZV3cnr0HS/on4eKu8tv7+sVtF7MMcBGyVtAoiIlcBs4Mk8XmwsfECGyw4abMDWA7ldoa2fh9EY7jPk99b4UvQAMwV4pupxH3B8K3Y8FoJJswb7HXTqd+Mvn2Hl9nlol8HeW9tGsA+/T7pH0QNMqU5ZubYgIuYD8wEkMXny5Lo726v8x4+0pIHWOYP9ncewYT8PjX4W/P4fWtHeW0Vqz0jaUugxGLL/0A6vejwV2FJbSdJiScdKOpbsQzjgFhGPDvbcWLr5OHO9ddqwn4dGPgsU8H1SpPYUqS1Fa09NW4ZV9B7Mw8D0iDgCeBY4Gzins00y6xh/HqyrFLoHI2k3cDGwBtiQFemJzrbKrDP8ebBuU/QeDJJWA6tbsKvFLdhHN/BxjmFj+PNQpPYUqS1QrPaMqC2lcnnAmLmZmVnTCn2KzMzMupcDjJmZ5aLwYzCjERF/SnaF8xSy6wS2AKskbehow8w6xJ8J64QxNwYTEZcBc4CVZNcNQHa9wNnASk8OaONN0T4TEVEim/amOtg9JKljX0YRMbG6PZJGMrlAq9tSuN/PaI3FAPPPwFGS/q2mfF/gCUnTO9Oy1ouIA4ArgDOBt6fi7cAdwEJJL3aqbXkp0hdBtyjSZyIiTgG+DfyW7FoeyILdu4DPSrq7XW1J7ZkB/B1wQE17XkztWd/m9hTt93MAMIu9g92aRr9bxuIpsteBycDvasonpefGEgE/AWZKeg4gIg4D5gH/CJzcwba11GBfBBHRkS+CLlOkz8Qi4COSNlcXpotHVwPvbXN7lgGflvRgTXtOAP4eeF+b21OY309EzAWuAu6m/zP3IeBrEfEVSSuG28dYDDCfB+6JiN/SPzHgO8j+A7i4Y63KxzRJ11W0J7KLAAAEI0lEQVQXpEBzXUR8qkNtyssyivVF0E2K9Jnoof80XbVngX3a3BaAt9a+pwAkrYuIt3agPUX6/fw1cExtbyUiDgIeBMZfgJF0V0S8m/5zmCWyP9jDkvZ0tHGt97uI+CKwvHKqKJ1COo+9Z90dC4r2RdA1CvaZWAo8nJYaqLxHDycbD7q5zW0BuDMifkz2ZVndnrnAXR1oT5F+PyXqTC5M1uttaC6yMTcGM56k/yQuJ8sOmkj2ZtgGrAKuk7Szg81rqYj4H8B/oP4XwdOSxlrvdMyKiPfSn9FWCXarJHVkXZuIOHWQ9rRixoTRtKcQv5+ImAf8Ddkpsuqe78nA1ZKWDbcPB5gxJCI+QPZf6q/aPRjYDkX7IjAb69I/sR9l78/cGkm7GtneAaaLRcRDko5L9/8rsAD4IXAK8L+ckm1FExGzJN2V7h8AXE/2T9HjwF+1OyuwKhNzNnBoKu5YJmbRfj+pHaPO3BxzYzDjTPWg36eBUyQ9HxFfB9YBYybAFO2LwEbta/SPbVwPPAd8DPgL4DtkKfftVMnE/FBNJuZ5dCYTszC/n5rMzT6yHsyIMjcdYLrbm1IX9k1ASdLzAJL+JSJ2d7ZpLVe0LwJr3rGSZqT7N6Rz/u02WCbmwog4vwPtqdbp388ymszcdIDpbgcAlRXmyhFxmKTnImI/Gszy6CJF/iKwxh0aEZeSvT/3j4hS1RXqnZgbsWiZmEX6/TSduekA08UkTRvkqdeBj7exKe1QtC8CG50lwNvS/eVAL/B86o0+1oH2fJIsE/On6f1UnYkZHWhPkX4/Tadwe5DfukJNSnZlDKbyRbCw0awW67w08eYU4EFJf6wqf2OAu1M6nYkZEZ8DbpdUiH+ams3cdICxrhcR50v6+063w4YXEX9JNnvABmAGcImkO9Jz6yX95za3p1CZmBHxEvAvwFPALcA/VsZWu5FPkdlY8BWyQUcrvvlk04/8MSKmAbdFxDRJi+jMuGHRMjE3AccAHyE7ffeViHiULNj8QNLL7WpIKzI3HWCsK0TELwd5qkQ2i4F1hwmV02KSNkfETLIg8+/pTIApWiZmWdLrZFfP3x0R+wCnki238HX6Z01vh6YzN72ipXWLiWSDix+rc3uhg+2ykXkuXV8BQAo2p5MNZv/HDrSnkon5CHBw+gKlg5mYe72mpH+TtErSHLJpWtppmqTrKsEltee5dNqwoba4B2Pd4kfAfpIGZNJExH3tb46N0lxgr56BpN3A3Ij4TrsbU8BMzE8O9oSk/9fOhtCCzE0P8puZ2QCtyNx0gDEzsxFpNHPTYzBmZjZSX2mkksdgzMxsgFZkbjrAmJlZPRPJ1oKpHWspAT9vZAcOMGZmVk/TmZse5Dczs1x4kN/MzHLhAGNmZrlwgDEzs1w4wJiZWS4cYMzMLBf/H7FlM7X2QAlGAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "messages.hist(column='length', by='label', bins=50)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 处理数据\n",
    "\n",
    "使用 Jieba 进行分词，分词数据来源：[yanyiwu/iosjieba](https://github.com/yanyiwu/iosjieba)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2019-01-04 16:37:29--  https://raw.githubusercontent.com/yanyiwu/iosjieba/master/iosjieba/iosjieba.bundle/dict/jieba.dict.small.utf8\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.108.133\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.108.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 1552161 (1.5M) [text/plain]\n",
      "Saving to: ‘dict/jieba.dict.small.utf8’\n",
      "\n",
      "dict/jieba.dict.sma 100%[===================>]   1.48M   543KB/s    in 2.8s    \n",
      "\n",
      "2019-01-04 16:37:32 (543 KB/s) - ‘dict/jieba.dict.small.utf8’ saved [1552161/1552161]\n",
      "\n",
      "--2019-01-04 16:37:32--  https://raw.githubusercontent.com/yanyiwu/iosjieba/master/iosjieba/iosjieba.bundle/dict/user.dict.utf8\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.108.133\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.108.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 33 [text/plain]\n",
      "Saving to: ‘dict/user.dict.utf8’\n",
      "\n",
      "dict/user.dict.utf8 100%[===================>]      33  --.-KB/s    in 0s      \n",
      "\n",
      "2019-01-04 16:37:33 (1.12 MB/s) - ‘dict/user.dict.utf8’ saved [33/33]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!mkdir dict\n",
    "!wget -c https://raw.githubusercontent.com/yanyiwu/iosjieba/master/iosjieba/iosjieba.bundle/dict/jieba.dict.small.utf8 -O dict/jieba.dict.small.utf8\n",
    "!wget -c https://raw.githubusercontent.com/yanyiwu/iosjieba/master/iosjieba/iosjieba.bundle/dict/user.dict.utf8 -O dict/user.dict.utf8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "removing punctuations: !\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from /Users/qiwihui/Development/nonWork/SMSFilters/model/dict/jieba.dict.small.utf8 ...\n",
      "Dumping model to file cache /var/folders/kl/g94q0k_571vdjtcwzzcv20s40000gn/T/jieba.u863e59d49beec8805b7d0b2ea2fa7f35.cache\n",
      "Loading model cost 0.395 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1    [商业, 秘密, 的, 秘密, 性, 那, 是, 维系, 其, 商业, 价值, 和, 垄断,...\n",
       "2    [南口, 阿玛, 施, 新春, 第一批, 限量, 春装, 到, 店, 啦, ,  ,  ,...\n",
       "3                   [带给, 我们, 大, 常州, 一场, 壮观, 的, 视觉, 盛宴]\n",
       "4                        [有, 原因, 不明, 的, 泌尿, 系统, 结石, 等]\n",
       "5                  [23, 年, 从, 盐城, 拉, 回来, 的, 麻麻, 的, 嫁妆]\n",
       "Name: message, dtype: object"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"removing punctuations: \"+string.punctuation)\n",
    "\n",
    "import jieba\n",
    "jieba.set_dictionary(\"./dict/jieba.dict.small.utf8\")\n",
    "jieba.load_userdict(\"./dict/user.dict.utf8\")\n",
    "jieba.initialize()\n",
    "\n",
    "def tokenize(message):\n",
    "    \"\"\" removes punctuation and tokenizes the words and stems each word.\n",
    "    \"\"\"\n",
    "    msg = \"\".join([ch for ch in message if ch not in string.punctuation]) # get rid of punctuations\n",
    "    stems = list(jieba.cut(msg, HMM=True))\n",
    "    return stems\n",
    "\n",
    "messages.message.head().apply(tokenize)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征\n",
    "\n",
    "特征抽取，TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "总词数 366750\n"
     ]
    }
   ],
   "source": [
    "fv = CountVectorizer(analyzer=tokenize).fit(messages.message)\n",
    "\n",
    "print(\"总词数\", len(fv.vocabulary_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First:\n",
      "  (0, 103421)\t1\n",
      "Second:\n",
      "  (0, 103378)\t1\n",
      "  (0, 162812)\t1\n"
     ]
    }
   ],
   "source": [
    "print(\"First:\")\n",
    "print(fv.transform([\"你好\"]))\n",
    "\n",
    "print(\"Second:\")\n",
    "print(fv.transform([\"好你\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "带给我们大常州一场壮观的视觉盛宴\n",
      "  (0, 76144)\t1\n",
      "  (0, 154901)\t1\n",
      "  (0, 157129)\t1\n",
      "  (0, 182155)\t1\n",
      "  (0, 182558)\t1\n",
      "  (0, 199561)\t1\n",
      "  (0, 274312)\t1\n",
      "  (0, 275510)\t1\n",
      "  (0, 317302)\t1\n"
     ]
    }
   ],
   "source": [
    "print(messages.message[3])\n",
    "print(fv.transform([messages.message[3]]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### TF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir -p result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def default(o):\n",
    "    if isinstance(o, np.int64): return int(o)  \n",
    "    raise TypeError\n",
    "\n",
    "import io\n",
    "with io.open(\"result/words_array.json\",'w',encoding=\"utf-8\") as outfile:\n",
    "    outfile.write(str(json.dumps(fv.vocabulary_, ensure_ascii=False, default=default)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Messages Feature Vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(800000, 366750)\n"
     ]
    }
   ],
   "source": [
    "messages_fv = fv.transform(messages.message)\n",
    "print(messages_fv.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  (0, 317302)\t0.39779219779562225\n",
      "  (0, 275510)\t0.40156911498600234\n",
      "  (0, 274312)\t0.08158076058986856\n",
      "  (0, 199561)\t0.21453331213319765\n",
      "  (0, 182558)\t0.30040221221192215\n",
      "  (0, 182155)\t0.40156911498600234\n",
      "  (0, 157129)\t0.2305152019929351\n",
      "  (0, 154901)\t0.4609408713995319\n",
      "  (0, 76144)\t0.3327471446311214\n"
     ]
    }
   ],
   "source": [
    "tfidf = TfidfTransformer().fit(messages_fv)\n",
    "\n",
    "# test tfidf of same message as before.\n",
    "t = tfidf.transform(fv.transform([messages.message[3]]))\n",
    "print(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(800000, 366750)\n"
     ]
    }
   ],
   "source": [
    "messages_tfidf = tfidf.transform(messages_fv)\n",
    "print(messages_tfidf.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "IDF of corpus : [ 9.82168363 13.89922108  9.72483381 ... 13.49375597 12.0274189\n",
      "  7.8577766 ]\n"
     ]
    }
   ],
   "source": [
    "idf = {}\n",
    "idf['idf'] = tfidf.idf_.tolist()\n",
    "\n",
    "with io.open(\"result/words_idf.json\",'w',encoding=\"utf-8\") as outfile:\n",
    "    outfile.write(str(json.dumps(idf, ensure_ascii=False)))\n",
    "\n",
    "print(\"IDF of corpus :\", tfidf.idf_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模型训练\n",
    "\n",
    "Linear SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.74 s, sys: 256 ms, total: 3 s\n",
      "Wall time: 3.16 s\n",
      "accuracy 0.99965375\n",
      "confusion matrix\n",
      " [[719972     28]\n",
      " [   249  79751]]\n",
      "(row=expected, col=predicted)\n"
     ]
    }
   ],
   "source": [
    "%time spam_detector = LinearSVC().fit(messages_tfidf, messages.label)\n",
    "\n",
    "predictions = spam_detector.predict(messages_tfidf)\n",
    "\n",
    "print('accuracy', accuracy_score(messages['label'], predictions))\n",
    "print('confusion matrix\\n', confusion_matrix(messages['label'], predictions))\n",
    "print('(row=expected, col=predicted)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5, 0, 'predicted label')"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAASIAAAD0CAYAAAA/riswAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3Xm8FNWZ8PHfARRNouKGIhfEhRiXcUMFx2hww6smwTcTHzWJgqOS8LrNGGM0rwlEMYN586pkXEZQIiQT8ZGEERMMQZRgZlwubknEOEFFuYKgsogxymDq/eOcDmXb1V0N1VT3ref7+fTndp86Ved0c3nuWarPcVEUYYwxeeqWdwWMMcYCkTEmdxaIjDG5s0BkjMmdBSJjTO4sEBljcmeBKAfOuW2dczOcc2875yLn3ICMrhs5576SxbVagXNuaHjPbXnXxWwaC0T5GA0cCRwF9AGWZHTdPsD0jK6VC+fcIufc2JTZ/wv/npc2rkZmc+iRdwUKaiDwXBRFv8/yolEUvZ7l9ZqZc27LKIrWAYV5z12ZtYjKOOcudM4tdM6975xb4ZybHju2jXPudufcG86595xzC5xzw2LHB4Sugjjn7nfOveuce8k5d3Ysz2LgPOC4kHdeKd05d3VZXe4oHQ+vP+2c+0/n3NrweNY5d1Ls+Ie6Zs65Ps65ac651c65vzjn5jnnDosdL3VtTnTOzQ/1XRi/ZsJnNDa0XMQ596dw3n+ELucXnHMvhPpNd85tFzvvUOfcA+Fzfcc51+Gca48dnwfsBYwJ9YrCZ1qq56nOud86594DRpV3zZxz3wzvdUDsmmOcc2+1Qvdt8eLFEZD2sTifWjZIFEX2CA/gu8A7wEXAJ4FDgatjx+/F/wKcBOwLTADWAZ8Kxwfgf0leAgTYGxgPrAcGhjw7A/cA84FdgR1C+uJ4WSHtDmBeeN4dWAncgG9RDQT+F3B0LH8EfCU8d8DjwDPAp4G/C+WuAnYKeYaGc54F2sM1pwKrgV5VPqexwJ+BXwIHAp8B3gB+DcwCDgKOBpYD18fOGwqMAPYLn++48Pl9MhzfAXgZ+EH4bHYN77tUzz8Cnwf2ANpi6W2x9zwbeBTf2j8a+B/g83n/bqV8RLEAXPUR5F3f7P7v5V2BZnkAHwf+AlyecHzv8Et/Sln6U8Dk8LwUiC6LHe+BD25fjaXdBTxYdp1agWj7cO2hVd5DPBAdH17vFzveE1gGfCe8Lv1H/kIsz64h7aQq5YzFB9edYmm3AB8AO8fSJgALanzuzwL/J/Z6ETC2LE+pnmcnpLfF0nqH93grfuxtQt6/W3U8om7duqV6BHnXN7OHjRFtsD+wFf6veiX7hZ/zy9Ln4wee454pPYmiaL1zbjmwy6ZULoqiVc65O4DZzrmHgN8AM6IoeiHhlP2Bt6IoWhi7xvvOucfDsaT6vu6c+yBFfV+LoujN2OvXgdejKHqjLK136YVzbmd8q/M4fMDrgf/Md69RVskTtTJEUbTCOfeP+JbZs8AVKa/dFJxzmVxHRPbBt4BL9gS+g2/x3oP/o7kYEFVdJSIO/4fjFOBdYKSqPhWuNQIoDRuMU9UpIX0Q/o/q1vjP+1JVjURkh0plVKuvjRF9VL3LEbgK56yrcM1an/Vfw7XitvjQRaLoAmAQMAffHfqDc+6rVa5Z6b2kqS/Uru//VCirUlr8Onfhu0tXhJ8H44PgljXKKvlzynyfwbfOdgG2q5G3qTjnUj1qUdUXVPVgVT0Y/zvzLjADuBKYq6oDgbnhNcDJbOjyjwJuAwhBZQwwGDgCGCMi24dzbgt5S+eVxvuSykhkgWiDhcB7+PGfSp4LP48pSz86dmxTrAB2K0s7pDxTFEV/iKLohiiKTgbuxP8iVPIcsJNzrtSSwznXE//LlEV9N8YxwK1RFM2M/IzhMvxf6rh1+HGhjeKcOwG4HD+W9AowxWXVzNgMsgpEZY4HXlTVV4DhwJSQPgU4LTwfDkxV1UhVHwN6iUgf/P+HOaq6MrRq5gDt4di2qvqoqkb4llb8WpXKSGSBKIii6B3g/wFjw8zZJ51zBznnrgrHX8QPVt/qnDvJOfcp59wE4IBBgwY9KiIvtLe3z9uEKjwInOGcG+ac28c5dyOxLotzbm/n3PVh5mx359yR+CC4MOF6D+G7Mj91zh3lnDsA/8uyFeGvXQ5eAL7snPs759zBwN18NOi8DBzlnOvvnNvJOddt8ODBVwCccsopD1a7eOj6/Rj4QRRFs4CzgL8HLsv6jTRKPYFIRBbEHkl/kADOxH/WALuo6jKA8LPUde7Lh+9n6wxp1dI7K6RXKyORjRF92Lfxsz+XADfiZ5jiY0LnA/8X+AmwLfD7bt26fX7PPfe8BTjxkUce6Q78d79+/QYAv62z7OvxgecefBfnVnzg2zsc/zO++TsNP/P2Fn7W6vJKF4uiKHLOnRbexy/xA9VPACeWje1sTucCt4d6LAe+D3ysLM+YkOcFfNDc48033/wVvuuQKLR67sK3gr4NEEXRy865rwF3Oed+E0XRguzeSmPU09pR1cNq5RGRLfGtw6tqFV0hLdqI9I1igSgm8tMuE8Kj0vG3ga+GBwAiciSwSFVfCq+/BfQrO2/vstcjK1x7LXB2eXrs+DLgCzXq78peL8P/NUzKP48Kv1BRFFX9vYiiaCx+5iyeNg4/HR9PG4+/faH0+vf4FkrcrWXnLMCPacT9UERmAr+oUf9TK9T1bja0Bppet26Zd1JOBp5S1eXh9XIR6aOqy0L3akVI7+TDv7dt+DvWO/Gzk/H0eSG9rUL+amUksq7ZpktquhpTtwaMEZ3FhwPxTPy9XISf98XSzxERJyJDgDWhWzUbGCYi24dB6mHA7HBsrYgMCTNu55Rdq1IZiSwQbbpMm6im2LIMRCLyMeBE4Oex5PHAiSLyp3Cs1GKdhb8RdxEwCfjfAKq6ErgW6AiPa0Ia+O9M3hHOeRF4oEYZiaxrtumSmrTG1C3LCT5VfRfYsSztLfwsWnneCLgw4TqTgckV0hcAB1RIr1hGNRaINl0HMFBE9gBew4/JfCnfKplW1UJ3GmTK+fFZsylE5BTgJvxU9GRVvS7nKnUpInI3fsB0J/xs2xhVvTPXSjVGtN126e6/XLNmDVQeFmhJFoiMaR5Rr169UmVcvXo1dKFAZF0zY5pIA6bvW4IFImOaSFHHiCwQGdNELBAZY3JX1EBUzA5pA9T40qHJQBE+4wZ9+77pWSDKTpf/T9IEuvxnXNRAZF0zY5pIVwwyaTTzfURNWzFj6pQ2ukS77Va+Nl5lS5curee6Ta+pW0St9Neho6ODww8/PO9q1KWJ/wgVViv9zmepqQORMUVjgcgYkzsLRMaY3FkgMsbkzgKRMSZ39qVXY0zurEVkjMmdBSJjTO4sEBljcmeByBiTOwtExpjcZRmIRKQXft+xA/Df3fxH/Fbe9wADgMWAqOqqsEniBOAU4F1gpKo+Fa4zArg6XHacqk4J6YPw23xvjd8X7VJVjURkh0plVKtrMecKjWlS3bp1S/VIaQLwK1X9FHAQ8DxwJTBXVQcCc8Nr8FtTDwyPUcBtACGojAEGA0cAY8KOr4Q8o2LntYf0pDKS33fad2SMabys1iMSkW2BY4A7AVR1naquBoYDU0K2KcBp4flwYKqqRqr6GNAr7Ft/EjBHVVeGVs0coD0c21ZVHw2bM04tu1alMhJZ18yYJlJP10xEFsReTlTVibHXewJvAD8SkYOAJ4FLgV3CvvWo6jIR6R3y9wWWxM7vDGnV0jsrpFOljEQWiIxpIvUEIlU9rMrhHsChwMWq+riITKB6F6lSwdFGpG8U65oZ00QyXCq2E+hU1cfD6+n4wLQ8dKsIP1fE8veLnd8GLK2R3lYhnSplJLJAZEwTySoQqerrwBIR2SckHQ8sBGYCI0LaCOC+8HwmcI6IOBEZAqwJ3avZwDAR2T4MUg8DZodja0VkSJhxO6fsWpXKSGRdM2OaSMb3EV0M/LuIbAm8BJyLb3yoiJwHvAqcHvLOwk/dL8JP358LoKorReRaoCPku0ZVV4bno9kwff9AeACMTygjUVOvWd1KN3fZUrGmitRrVh9yyCGpMj799NP1XLfpWYvImCbSSn98s2SByJgmYoHIGJM7C0TGmNxZIDLG5M4CkTEmdxaIjDG5s8XzjTG5sxaRMSZ3FoiMMbmzQGSMyZ0FImNM7iwQGWNyZ4HIGJM7m743xuTOWkTGmNxZIDLG5M4CkTEmdxaIjDG5s0BkjMmdBSJjTO5s+r7BRKQdmAB0B+5Q1fGbq2xjWoW1iBpIRLoDtwAn4neI7BCRmaq6cHOUb0yryDIQichiYC3wAbBeVQ8TkR2Ae4ABwGJAVHVV2CRxAn5vs3eBkar6VLjOCODqcNlxqjolpA9iw75ms4BLVTVKKqNaXTdXO/AIYJGqvqSq64BpwPDNVLYxLSPDLadLjlXVg1X1sPD6SmCuqg4E5obXACcDA8NjFHAbQAgqY4DB+P/HY8KOr4Q8o2LntdcoI9Hm6pr1BZbEXnfi39iHiMgo/BtDVeno6CjP0rT23XfflqqvaU6boWs2HBgank8B5gHfDOlTVTUCHhORXmHf+qHAnNLuriIyB2gXkXnAtqr6aEifCpyG3+01qYxEmysQVfp0P7LNqKpOBCaWjrfSzqm206vJQj2BSEQWxF5ODP9/4iLg1yISAbeH47uEfetR1WUi0jvkrdRY6FsjvbNCOlXKSLS5AlEn0C/2ug1YupnKNqZl1DNrFutuJTlKVZeGQDBHRP5YJW9SY6He9I2SGIhE5JE0F1bVY1KU0wEMFJE9gNeAM4Evpa2kMUWRZddMVZeGnytEZAZ+jGe5iPQJLZU+wIqQPamx0MmGblYpfV5Ib6uQnyplJKrWIrqj1slpqep6EbkImI2fvp+sqs9ldX1juoqsApGIfBzopqprw/NhwDXATGAEMD78vC+cMhO4SESm4cdv14RAMhv4XmyAehhwlaquFJG1IjIEeBw4B/jX2LUqlZEoMRCVpuiyoqqz8FN8xpgEGbaIdgFmiAj4/+c/VdVfiUgHoCJyHvAqcHrIPws/db8IP31/LkAIONfiezUA15QGroHRbJi+fyA8wAegSmUkcmkGLMM9BucDZwE7qeqBInIMsKuqas0LbJyolW7ussFqU0XaX+Toi1/8YqqM06dPr+e6TS/tyNg1wHn4Ga3+Ia2TGlNyxpj6NOA+opaQNhCNBD6rqtPYMID9MrBnIyplTFEVNRClnb7vDrwTnpcC0SdiacaYDBT1S69p3/Us4AYR6Ql/GzO6Fri/URUzpoiK2iJKG4guA3YD1gDb4VtCu2NjRMZkqqiBKFXXTFXfBk4Ld2juDixR1dcbWjNjCqgrBpk0UndIRaQXfhmPocDxsRucjDEZKWqLKFUgEpHj8OuKXAIcDlwMvCwixzeuasYUT1EDUdpZs5uBUfGbF0XkdPxiZ59qRMWMKaKuGGTSSNs12w34WVnaDGDXbKtjTLF169Yt1aOrSfuOpgIXlqWNDunGmIxY16xM2TIg3YDRInIFfhmPvvgv1T3W8BoaUyBdMcikUc8yIJMaWRFjjAWij8h6GRBjTG0WiGoQkV3wK7ztRGz5AVWd3IB6GVNIFoiqEJHTgJ8AfwL2B54DDgB+C1ggMiYjRQ1EaWfNxgHnquohwJ/Dz1HAkw2rmTEFZNP31fVX1XvL0qbg16k1xmSkqNP3aQPRijBGBLBYRI4E9sKvU2SMyYgFouomAZ8Oz28EHgaeBW5tRKWMKaqiBqK0y4BcH3s+NWw3+3FVfb5RFTOmiLpikEljo3Z6VdVXs66IMcYC0UeIyBLS7fTav1YeY0w6WQciEekOLABeU9XPht2WpwE7AE8BZ6vqurAM9FRgEPAWcIaqLg7XuAq/i88HwCWqOjuktwMT8GPFd6jq+JBesYxq9aw2RvQV4OwUD2NMRhowfX8pEB9CuR64UVUHAqvwAYbwc5Wq7o0fB74eQET2w28Rvz/QDtwqIt1DgLsFOBnYDzgr5K1WRqJqX/H4Tco3aozJSJYtIhFpA04FrgMuC5teHAd8KWSZAowFbgOGh+cA04GbQ/7hwDRVfR+/GOIi/DcsABap6kuhrGnAcBF5vkoZiTZqjMgY0xj1BCIRWRB7OVFVJ5ZluQm4AtgmvN4RWK2q68PrTvxKGoSfSwBUdb2IrAn5+/LhVTbi5ywpSx9co4xEFoiMaSL1BCJVPSzpmIh8Flihqk+KyNDS5StkjWocS0qv1D+slr+qrnevuDEtLMP7iI4CPi8ii/EDx8fhW0i9RKTUAGkDlobnnUA/gHB8O2BlPL3snKT0N6uUkcgCkTFNJKtApKpXqWqbqg7ADzY/pKpfxt+M/MWQbQRwX3g+M7wmHH9IVaOQfqaI9AyzYQOBJ4AOYKCI7CEiW4YyZoZzkspIVG36/sekm76375sZk5HN8IXWbwLTRGQc8DRwZ0i/E/hxGIxeiQ8sqOpzIqLAQmA9cKGqfgAgIhcBs/HT95NV9bkaZSRyUVQ51ojImNjLnfCR7X7gFaA/8Dlgiqpekurt1y9qpZu7Ojo6OPzww/OuRl2S/u1N5tL+IkdXXnllqozjx4+v57pNr9r0/XdLz0VkNnCqqj4SS/s08O3GVs+YYmmlP75ZSjtrNoSPLpT/OHBkttUxptiKGojSdkifBr4nIlsDhJ/XAc80qmLGFJF9+766kcBPgTUisgrYHv/9lS83qF7GFFJXDDJppF0GZDHw9yLSD7/r6zL7Br4x2bNAVIOI7AgMBfqo6vdFZDegm6p2NqpyxhRNV1yPOo20u3h8BvgZvjt2FPB9/I1Nl+On8Rui1aaXW62+nZ2t9Tekd+/erFixIu9q1KWtra2u/EVtEaUNvzfh1ydpx9/UBH7W7IjkU4wx9bLB6uoGqOrc8Lz0Z39dHecbY1LoikEmjbQtooUiclJZ2gnA7zOujzGFZi2i6r4O/EJEfglsLSK348eGhjesZsYUUFcMMmmkahGp6mPAgfitpicDLwNHqGpHA+tmTOFYi6gKEblcVX+Any2Lp1+mqjc0pGbGFFBRp+/TvuvvJKRfnVVFjDHWIqpIRI4LT7uLyLF8eNmBPYG1jaqYMUXUFYNMGrW6ZqUFjbbCjw2VRMBy4OJGVMqYorJAVIGq7gEgIlNtJUZjGq+ogSjtGNEN4QuvfyMi/UTkoAbUyZjCKuoYUdpA9BNgi7K0LYEfZ1sdY4rNAlF1/Us7Opao6ovAgMxrZEyBNWDL6ZaQ9h11isih8YTwuuZ+RcaY9IraIkr7FY8bgftE5PvAi8Be+CVArmtUxYwpoq4YZNJIu0LjJBFZDZyH391xCfB1VZ3eyMoZUzRZBSIR2QqYD/TE/z+frqpjwiaJ04AdgKeAs1V1nYj0BKYCg4C38Mv+LA7Xugr/f/8D4BJVnR3S24EJ+H3N7lDV8SG9YhnV6pu6s6mq96pqu6ruH35aEDImYxl2zd4HjlPVg4CDgXYRGQJcD9yoqgOBVfgAQ/i5SlX3xveArgcQkf3wmy3uD7QDt4pIdxHpDtwCnAzsB5wV8lKljERpv2vmgPNDhXZW1QNF5BhgV1XVNNcwxtSWVYsobP38Tni5RXhEwHHAl0L6FGAscBt+JY2xIX06cHP4fz8cmKaq7wMvh51gSwsiLipNYonINGC4iDxfpYxEaceIrgFOxK/U+G8hrRMfOS0QGZORegKRiCyIvZyoqhPLjncHngT2xrdeXgRWq2ppldVOoG943hc/5IKqrheRNcCOIT2+p2H8nCVl6YPDOUllJKpnO6FDVPVNESlFtpfx3zczxmSknql5VT2sxvEPgINFpBcwA9i3QrbSiquVImBUJb1SRavlryrtu+7OhmZe6aKfiKUZYzLQiOl7VV0NzMPv2NxLREoNkDY23ILTiZ+IIhzfDlgZTy87Jyn9zSplJEobiGbhv+bRM1TUAdcC96c83xiTQlaBSER2Di2h0s7MJwDPAw8DXwzZRgD3heczw2vC8YfCONNM4EwR6RlmwwYCTwAdwEAR2UNEtsSPH88M5ySVkShtILoMv7HiGnykfAfYHfhmyvONMSlk2CLqAzwsIr/DB405qvoL/P/Zy8Kg845sWGHjTmDHkH4ZcCWAqj6HHwdeCPwKuFBVPwhjQBcBs/EBTkNeqpSR/L7r2YtLRHrjA9ASVX099Ykbp7U2CWtBtq9Z44V9zdL2paJJkyalynjBBRfUc92ml3pkLDTzTsTv9nq8iGzfqEoZU1RF/YpHqkAUVmpcDFwCHI5fEO1lETm+cVUzpniK+qXXtNP3NwOj4jcvisjp+HsTPtWIihlTRF2xtZNG2tC6G/CzsrQZwK7ZVseYYrOuWXVTgQvL0kaHdGNMRooaiNJ2zQ4FRovIFcBr+Fu2ewOPi8j8UiZVPSb7KhpTHF0xyKSRNhBNCg9jTANZIKpCVac0uiLGmOIGorTT93eIyMfK0vqIyK8aUy1jiqmo0/dp39E2wO9E5EgAETkT+B3wdKMqZkwRFXWwOlUgUtUzgDH4dasfAcYBp6nqVY2snDFFU9RAlHawGvxs2Xv4NYgW4hdZMsZkqCsGmTTSjhH9AL8Y9iX4vcyewXfVTm9c1YwpHmsRVbcvcJCqLg+vvyEi9+PXo723ITUzpoC6YpBJI+30/akV0uaLyIHZV8mY4rJAVIOInAichd/F43MichiwLfBQoypnTNF0xan5NNKOEV2M3w7kv4HS1zj+gp89M8ZkpKhjRGnD7z8BJ4SdHP8a0v4I7JPmZBGZLCIrROQPG1FHYwrDAlF127BhD6PSEq5bAFW3kY25C79LpDGmCgtE1c0nLKYdcwl+tf6aVHU+fmsSY0wVRQ1EaQerLwbuF5ELgG1E5AXgbeBzWVZGREYBowBsJ+vG6927d95VqEuPHj1ars716opBJo200/fLRORw/HrVu+O7aU+o6l+rn1mfsGVuadtc28WjwVptR4wW3sUjNQtENYSN054ID2NMA2Q1fS8i/fArqO6Kn2CaqKoTRGQH4B78NyQWA6Kqq8KmqROAU4B3gZGq+lS41gjg6nDpcaVlgURkEH78d2v8JqyXqmqUVEbV953JuzbGZCLDMaL1wNdVdV/8VtMXish++LHeuao6EJjLhrHfk/G7uA7ED4/cBhCCyhhgMHAEMCa2ldhtIW/pvNKEVFIZiTZLIBKRu4FHgX1EpFNEztsc5RrTarIKRKq6rNSiUdW1+N1Y+wLD8V/NIvw8LTwfDkxV1UhVH8PvX98HOAm/S+zK0KqZA7SHY9uq6qOhtzS17FqVykhUz7fvN5qqnrU5yjGm1dUzRiQiC2IvJ4Yx1kr5BgCHAI8Du6jqMvjb2G9p9L8vG27RAegMadXSOyukU6WMRJslEBlj0qknEKnqYbXyiMgn8FuB/ZOqvi0iiUVXSIs2In2j2BiRMU0ky/uIRGQLfBD6d1X9eUheHrpVhJ+lachOoF/s9DZgaY30tgrp1cpIZIHImCaSVSAKs2B3As+r6g2xQzOBEeH5COC+WPo5IuJEZAiwJnSvZgPDRGT7MEg9DJgdjq0VkSGhrHPKrlWpjETWNTOmiWT47fujgLOB34vIMyHtW8B4QMOE0atAaXHDWfip+0X46ftzAVR1pYhcC3SEfNeoaulbEqPZMH3/QHhQpYxELoqa9r7Bpq1YV9HZ2Vk7UxNp4Rsa0w78RA8/nOpbUxx77LH1XLfpWYvImCZid1YbY3JngcgYkzsLRMaY3FkgMsbkrqhrVlsgMqaJWIvIGJM7C0TGmNxZIDLG5M4CkTEmdxaIjDG5s0BkjMmdTd8bY3JnLSJjTO4sEBljcmeByBiTOwtExpjcWSAyxuTOApExJnc2fW+MyZ21iIwxubNAZIzJnQUiY0zusgxEIjIZ+CywQlUPCGk7APcAA4DFgKjqqrBJ4gT83mbvAiNV9alwzgjg6nDZcao6JaQPYsO+ZrOAS1U1SiqjWl2LOTJmTJPKcstpfJBoL0u7EpirqgOBueE1wMnAwPAYBdwGfwtcY4DBwBHAmLDjKyHPqNh57TXKSGSByJgmkmUgUtX5wMqy5OHAlPB8CnBaLH2qqkaq+hjQK+xbfxIwR1VXhlbNHKA9HNtWVR9V1QiYWnatSmUksq6ZMU2knul7EVkQezlRVSemOG2XsG89qrpMRHqH9L7Akli+zpBWLb2zQnq1MhJZIDKmidQzRqSqh2VZdIW0aCPSN4p1zYxpIhmPEVWyPHSrCD9XhPROoF8sXxuwtEZ6W4X0amUkskBkTBPZDIFoJjAiPB8B3BdLP0dEnIgMAdaE7tVsYJiIbB8GqYcBs8OxtSIyJMy4nVN2rUplJLKumTFNJOPp+7uBocBOItKJn/0aD6iInAe8Cpwess/CT90vwk/fnwugqitF5FqgI+S7RlVLA+Cj2TB9/0B4UKWMRC6KNrpb12hNW7GuorOzs3amJtK7d29WrKjZym8qbW1tUHk8pZLo1VdfTZWxf//+9Vy36VmLyJgmYndWG2NyZ9++N8bkrqgtIhsjMqbxUo8RLVu2LFXGPn361HPdptfM7UDXSg8ReTLvOnT1Rwt/xqlthun7pmRdM2OaSFcMMmlYIDKmiVggMpsqzRcOzabp8p9xUWfNivmuGyDlN583iYjcJSLjwvOjReSFRpcZyopEZO+EY/NE5PyU11ksIidsZB0WAy9tzLmtxMaITEtR1UeAfWrlE5GRwPmq+umGV8pssq4YZNKwFlFORMT+CJiPsBaR2WSh+3A7cDbQB/gPYLSqviciQ4GfAP8K/DN+pbuzReSzwDhgALAQ+Jqq/i5c7xDgTvwynLOI3VtVup6qtoXX/fBrDh+N/wNzN3AL8G/AFiLyDrBeVXuJSE/gOkCAnsAM4J9V9S/hWt8ALgvlldYqTvP+9wImAQeFc2cDF6rq6li2w0Xkh+WfTzg/8bMoiq4YZNKwFlH2voxfXnMv4JN8+D/yrsAOwO7AKBE5FJgMfBXYER/EZopITxHZEv8f9cfhnHuBf6hUoIh0B34BvIL/T9wXmKaqzwNfAx5V1U+oaq9wyvWhbgcDe4f83wnXagcuB07EB8B6xnQc8C/AbsC++HVsxqb5fKq01WrSAAACy0lEQVR9FnWU3/KsRWSycrOqLgEQkevwLaBSMPorMEZV3w/HLwBuV9XHw/EpIvItYAi+RbEFcFNYE3i6iFyWUOYR+P/831DV9SHtt5UyhrVjLgAOLC3nICLfA34KXIVvJf1IVf8Qjo0FzkrzxlV1EX4ZCYA3ROQG/NITcUmfT7XP4jdpyu8KumKQScMCUfbi6/u+gg8QJW+UuiHB7sAIEbk4lrZlOCcCXgtBKH69SvoBr8SCUDU7Ax8DnhSRUpoDuofnuwFPpijzI8LaxD/Edw+3wbe4y7eRSfp8qn0WhVHU6XsLRNmLL6vZnw3LZ8JHvz+3BLhOVa8rv4iIfAboKyIuFoz6Ay9WKHMJ0F9EelQIRuVlvgn8BdhfVV+rcK1lFd5DWv8SyjtQVd8SkdOAm8vyJH0+iZ9FkViLyGTlQhH5BX6Vu2/hN5pLMgmYISIPAk/gWypDgfnAo8B64BIRuQX4PL4L9nCF6zyBDyDjRWQM8AEwSFX/E1gOtInIlqq6TlX/KiKTgBtF5CJVXSEifYEDVHU2oMCPRGQqfnO88q5VNdsAa4DV4ZrfqJAn6fNJ/CxUdW0ddWhpRQ1ExWwHNtZPgV/jb757CT8LVJGqLsCPjdyM78IsAkaGY+uAL4TXq4AzgJ8nXOcD4HP4gedX8QubnxEOPwQ8B7wuIm+GtG+Gsh4TkbeBBwn3JKnqA8BN4bxF4Wda3wUOxQejXybUt+LnU+2zKJKiDlY38zIgLSdM35+vqg/mXRfTkqL33nuvdi5gq622gjq/2d/MrGtmTBPpiq2dNCwQGdNEihqIrGtmTPOI1q9PcwcG9OjRA6xrZoxphKK2iCwQGdNELBAZY3JX1EBk9xEZ0zxeqeM+otRfvWkFNlhtjMmdtYiMMbmzQGSMyZ0FImNM7iwQGWNyZ4HIGJM7C0TGmNxZIDLG5M4CkTEmdxaIjDG5+//dUTG0NUQ35gAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 288x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.matshow(confusion_matrix(messages['label'], predictions), cmap=plt.cm.binary, interpolation='nearest')\n",
    "plt.title('confusion matrix')\n",
    "plt.colorbar()\n",
    "plt.ylabel('expected label')\n",
    "plt.xlabel('predicted label')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       1.00      1.00      1.00    720000\n",
      "           1       1.00      1.00      1.00     80000\n",
      "\n",
      "   micro avg       1.00      1.00      1.00    800000\n",
      "   macro avg       1.00      1.00      1.00    800000\n",
      "weighted avg       1.00      1.00      1.00    800000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(messages['label'], predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 转换为 CoreML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "import coremltools\n",
    "coreml_model = coremltools.converters.sklearn.convert(spam_detector, \"message\", \"spam_or_not\")\n",
    "\n",
    "#set parameters of the model\n",
    "coreml_model.short_description = \"Classify whether message is spam or not\"\n",
    "coreml_model.input_description[\"message\"] = \"TFIDF of message to be classified\"\n",
    "coreml_model.output_description[\"spam_or_not\"] = \"Whether message is spam or not\"\n",
    "#save the model\n",
    "coreml_model.save(\"result/SpamMessageClassifier.mlmodel\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "input {\n",
       "  name: \"message\"\n",
       "  shortDescription: \"TFIDF of message to be classified\"\n",
       "  type {\n",
       "    multiArrayType {\n",
       "      shape: 366750\n",
       "      dataType: DOUBLE\n",
       "    }\n",
       "  }\n",
       "}\n",
       "output {\n",
       "  name: \"spam_or_not\"\n",
       "  shortDescription: \"Whether message is spam or not\"\n",
       "  type {\n",
       "    int64Type {\n",
       "    }\n",
       "  }\n",
       "}\n",
       "output {\n",
       "  name: \"classProbability\"\n",
       "  type {\n",
       "    dictionaryType {\n",
       "      int64KeyType {\n",
       "      }\n",
       "    }\n",
       "  }\n",
       "}\n",
       "predictedFeatureName: \"spam_or_not\"\n",
       "predictedProbabilitiesName: \"classProbability\"\n",
       "metadata {\n",
       "  shortDescription: \"Classify whether message is spam or not\"\n",
       "}"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from coremltools.models import MLModel\n",
    "model = MLModel('result/SpamMessageClassifier.mlmodel')\n",
    "\n",
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "366750\n",
      "366750\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "words_array = json.load(open(\"result/words_array.json\", 'r'))\n",
    "print(len(words_array))\n",
    "\n",
    "words_idf = json.load(open(\"result/words_idf.json\", 'r'))\n",
    "print(len(words_idf[\"idf\"]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 参考\n",
    "\n",
    "- [Building offline iPhone spam classifier using CoreML](https://medium.com/ymedialabs-innovation/building-offline-iphone-spam-classifier-using-coreml-3552c2beb2b2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
